# This script extracts data from SSA ASR HTML Tables 61-63, saves output as CSV

library(rvest)
library(tidyverse)
library(tools)

# Relative paths of every file found (recursively) under "html/" whose path
# contains "/SSDI_ASR_61_", "/SSDI_ASR_62_", or "/SSDI_ASR_63_". Because the
# pattern requires the leading slash, only files inside a subdirectory of
# "html/" are matched
html_files <- str_subset(
  string = list.files(path = "html", recursive = TRUE),
  pattern = ".*(\\/SSDI_ASR_(61|62|63)_).*"
)

# Extract the data rows of one SSA ASR table (61, 62, or 63) from an HTML file
# and save them as a CSV file.
#
# Args:
#   html_file: Path of the HTML file relative to the "html" directory. It must
#     contain "/SSDI_ASR_<61|62|63>_<age>.html"; the table number and the age
#     label are both read from this name.
#
# Returns:
#   Invisibly, the tibble that was written. The CSV is saved under "csv" at the
#   same relative path, with the extension replaced by ".csv".
table6X_html_to_csv <- function(html_file) {
  # Table number encoded in the file name. Kept in a local called `table_id`
  # so it can never be confused with the base::table() function
  table_id <- str_extract(html_file, "(?<=\\/SSDI_ASR_)(61|62|63)(?=_)")
  if (is.na(table_id)) {
    stop("Cannot find a table number (61-63) in file name: ", html_file,
         call. = FALSE)
  }

  # Age group or single year of age for each file
  age_label <- str_extract(
    html_file,
    "(?<=\\/SSDI_ASR_(61|62|63)_).*(?=[.]html)"
  )

  # Parse the first HTML table in the file, keeping header rows as data
  tables <- read_html(file.path("html", html_file)) %>%
    html_table(header = FALSE, fill = TRUE)
  if (length(tables) == 0) {
    stop("No HTML table found in file: ", html_file, call. = FALSE)
  }
  x <- tables[[1]]

  col_names <- c("Year",
                 "All decisions, Total",
                 "All decisions, Allowances",
                 "All decisions, Allowance rate (percent)",
                 "Decisions SS only, Total",
                 "Decisions SS only, Allowances",
                 "Decisions SS only, Allowance rate (percent)",
                 "Decisions SS + SSI, Total",
                 "Decisions SS + SSI, Allowances",
                 "Decisions SS + SSI, Allowance rate (percent)")
  if (ncol(x) != length(col_names)) {
    stop("Expected ", length(col_names), " columns but found ", ncol(x),
         " in file: ", html_file, call. = FALSE)
  }

  # Panel header rows: first cell is empty, second cell holds the group name.
  # `[[` is used so this works whether html_table() returned a data.frame or
  # a tibble (with a tibble, x[panels, 2] would stay a one-column tibble)
  panels <- which(as.character(x[[1]]) == "")
  if (length(panels) == 0) {
    stop("No panel header rows (empty first cell) found in file: ", html_file,
         call. = FALSE)
  }

  # Each group name is repeated once per data row of its panel. The last panel
  # ends one row before the final row of the table, which is dropped below
  rows_per_panel <- diff(c(panels, nrow(x))) - 1
  group_col <- rep(as.character(x[[2]][panels]), times = rows_per_panel)

  # Replace non-ASCII characters (e.g. non-breaking spaces) in group names
  group_col <- str_replace_all(group_col, "[^\\x00-\\x7F]+", " ")
  stopifnot(length(showNonASCII(group_col)) == 0)

  # Rename columns
  colnames(x) <- col_names

  # Rows that are not data: everything up to and including the first panel
  # header, the remaining panel headers, and the final row of the table
  drop_rows <- c(seq_len(panels[1]), panels[-1], nrow(x))

  # Select rows corresponding to data; add table, group, and age info
  y <- x %>%
    as_tibble() %>%
    slice(-drop_rows) %>%
    mutate(table = table_id, group = group_col, age = age_label) %>%
    select(table, group, age, everything())

  # Save as CSV, creating the output directory if needed
  csv_file <- file.path("csv", paste0(file_path_sans_ext(html_file), ".csv"))
  dir.create(dirname(csv_file), showWarnings = FALSE, recursive = TRUE)
  write_csv(y, csv_file)

  invisible(y)
}

# Convert all HTML files to CSV. The function is called only for its side
# effect (writing a CSV), so walk() is used: it returns its input invisibly
# instead of a list of results that would be printed at top level
walk(html_files, table6X_html_to_csv)




# EOF
